#
# createLessGluedData.R
#
# Create a version of the ANES cumulative data with
# no items having constant item parameters longer than
# X years. This essentially shortens the distance of any
# glue for the scaling.
#
# Hill, Seth J. and Chris Tausanovitch. "A Disconnect in Representation? Comparison of Trends in Congressional and Public Polarization."
#

# Parameters.
# Maximum span in years over which an item's parameters are held constant.
max.distance <- 10
# Non-item columns retained in the saved data. Columns are selected by the
# values (ANES variable codes); the names are descriptive labels only.
additional.vars.to.keep <- c("year" = "VCF0004", "respondent" = "VCF0006",
                             "party" = "VCF0301", "weight" = "VCF0009a")
# If TRUE, the coverage plot omits pre-1956 years and every item whose first
# year is before 1956. Only the plot is affected, not the saved data.
exclude.pre.56 <- TRUE

unGlue <- function(x,max.distance) {
  # Unglue a set of numbers into consecutive groups ("Item 1", "Item 2", ...)
  # such that no group spans more than max.distance.
  # Arguments.
  #  x - a numeric vector (e.g. years). Order does not matter; NAs are dropped.
  #  max.distance - single non-negative number, maximum span (inclusive)
  #                 allowed within any one group.
  # Returns x sorted ascending and named by the group each element falls in.
  # Stops if max.distance is invalid or x has no non-missing values.
  stopifnot(is.numeric(max.distance), length(max.distance) == 1L,
            !is.na(max.distance), max.distance >= 0)
  x <- sort(x) # sort() also drops NAs, so everything below is NA-free
  if (length(x) == 0L) {
    stop("unGlue() needs at least one non-missing value in 'x'.", call. = FALSE)
  }
  max.x <- x[length(x)]
  # Groups are the half-open intervals (breaks[k], breaks[k + 1]]. The first
  # lower break sits just below the smallest value so that value is included.
  breaks <- c(x[1] - 1, x[1] + max.distance)
  # Greedy partition: while some value lies beyond the last break, start a new
  # group at the smallest such value and let it extend max.distance further.
  # Every group therefore contains at least one element of x.
  while (breaks[length(breaks)] < max.x) {
    next.start <- x[x > breaks[length(breaks)]][1]
    breaks <- c(breaks, next.start + max.distance)
  }
  group <- cut(x, breaks = breaks, right = TRUE,
               labels = sprintf("Item %s", seq_len(length(breaks) - 1L)))
  names(x) <- as.character(group)
  x
}

#print(unGlue(seq(1948,2012,by=4),max.distance=max.distance));cat("\n")
#print(unGlue(seq(1976,2012,by=2),max.distance=max.distance));cat("\n")
#print(unGlue(sample(seq(1948,2012,by=2),10),max.distance=max.distance));cat("\n")

# ANES cumulative file modified by create2012Data.R and mergeCrossSectionalItems.R.
# The code below uses anes.cdf.with.2012 and extra.vars.50s.60s, which are not
# defined in this script and are presumably provided by this file.
load("anes_cdf_extra_50s60s.RData")

# Current list of items and set of years.
vars <- read.csv("ANES_Cum_Vars_ToUse.csv", as.is = TRUE)
# Everything downstream addresses the first two columns by name, so fail here
# with a clear message rather than later with an obscure one.
stopifnot(identical(names(vars)[1:2], c("VarNumber", "VarLabel")))
names(extra.vars.50s.60s) <- names(vars)[1:2]
vars <- rbind(vars[, 1:2], extra.vars.50s.60s) # append extra vars.
library(data.table)
DT <- data.table(anes.cdf.with.2012)
# Aggregate number of non-missing obs by item and year (VCF0004): one row per
# year, one count column per item.
num.miss <- DT[, lapply(.SD, function(x) sum(!is.na(x))),
               .SDcols = vars$VarNumber, by = "VCF0004"]

# Apply un-glueing algorithm to each item. When an item is broken into
# several codings,
# (1) create one row per coding in the unglued.vars data.frame, and
# (2) create one column per coding in anes.cdf.with.2012.
# Items that are not broken get a single row and keep their original column.
unglued.vars <- NULL
for (v in vars$VarNumber) {
  # Years in which item v has at least one non-missing response.
  years <- num.miss[num.miss[[v]] != 0, VCF0004]
  if (length(years) == 0L) {
    stop(sprintf("Item %s has no non-missing observations in any year.", v),
         call. = FALSE)
  }
  # Run unGlue(): years named "Item 1", "Item 2", ... by coding.
  res <- unGlue(years, max.distance)
  base.row <- vars[vars$VarNumber == v, c("VarNumber", "VarLabel")]
  codings <- unique(names(res))
  if (length(codings) > 1L) {
    for (i in codings) { # i new codings of item
      coding.name <- sprintf("%s%s", v, gsub("Item ", ".", i)) # item name appended with ".i"
      coding.years <- res[names(res) == i]
      # (1) New row in unglued.vars for this coding.
      new.row <- base.row
      new.row[, "VarNumber"] <- coding.name
      new.row$FirstYear <- min(coding.years) # first year of this coding
      unglued.vars <- rbind(unglued.vars, new.row)
      # (2) New column: responses for respondents in this coding's years, else NA.
      # Columns are indexed explicitly (not via with()/get()) so that a data
      # column named like a loop variable cannot shadow it.
      anes.cdf.with.2012[[coding.name]] <- ifelse(
        anes.cdf.with.2012[["VCF0004"]] %in% coding.years,
        anes.cdf.with.2012[[v]], NA)
    }
  } else { # no new coding
    base.row$FirstYear <- min(res) # first year of item
    unglued.vars <- rbind(unglued.vars, base.row)
  }
}

# Write out the reduced data together with the table of "new" items.
unglued.item.list <- unglued.vars$VarNumber
# Keep only the unglued item columns plus the additional variables, and
# restrict to cases from 1952 onward (drops pre-1952 cases).
keep.cols <- c(unglued.vars$VarNumber, additional.vars.to.keep)
anes.cdf.with.2012 <- anes.cdf.with.2012[anes.cdf.with.2012$VCF0004 >= 1952,
                                         keep.cols]

save(anes.cdf.with.2012, unglued.vars,
     file = "anes_cdf_extra_50s60s_less_glue.RData")

#
# Plot item coverage by year.
#
# NOTE(review): this header is printed but the table itself (num.miss) is
# never printed below -- confirm whether that is intended.
cat("Number of non-missing observations by year:\n")
library(data.table)
DT <- data.table(anes.cdf.with.2012)
# Aggregate number of non-missing obs by item and year, optionally restricted
# to years after 1955.
plot.rows <- if (exclude.pre.56) DT[VCF0004 > 1955] else DT
num.miss <- plot.rows[, lapply(.SD, function(x) sum(!is.na(x))),
                      .SDcols = unglued.vars$VarNumber, by = "VCF0004"]
if (exclude.pre.56) {
  warning("Plotting only items appearing after 1955.\n")
}
# Order items by first year, then name.
unglued.vars <- data.table(unglued.vars)
setkeyv(unglued.vars, c("FirstYear", "VarNumber"))
if (exclude.pre.56) { # drop items whose first year is before 1956
  unglued.vars <- unglued.vars[FirstYear > 1955, ]
}
# Reversed so the earliest items get the largest y position (top of the plot).
items <- rev(unglued.vars$VarNumber)
item.pos <- seq_along(items)
pdf("graphs/Cumulative_Map_LessGlued.pdf", width = 10, height = 8)
par.old <- par(mar = c(4.1, 6.1, 1.1, 1.1))
# Empty canvas: x spans the observed years, y has one row per item.
plot(x = num.miss[, range(VCF0004)], y = c(1, length(items)),
     type = 'n', ann = F, axes = F)
axis(1, at = num.miss[, seq(min(VCF0004), max(VCF0004), by = 2)])
axis(2, las = 2, at = item.pos, labels = items, cex.axis = .6, tick = F)
abline(h = item.pos, col = 'gray')
# One dot per (item, year) with at least one non-missing response.
for (i in item.pos) {
  has.obs <- num.miss[[items[i]]] != 0
  points(x = num.miss[has.obs, VCF0004], y = rep(i, sum(has.obs)), pch = 19)
}
par(par.old)
dev.off()
cat("With", max.distance, "year maximum distance,", length(items), "unique items in estimation.\n")
